Note: Open scripts.Rproj first, then script. To easily use relative paths, click the down button next to knit and then click “Knit Directory –> Project Directory”. This should make loading and saving files much easier.
Perform Differential Gene Expression (DGE) analysis using a read count table (usually from Salmon) and a sample metadata sheet (user made). Red text indicates regions that require the user to modify. Regardless, the user should check over all code blocks to ensure that everything is running correctly.
Load packages
library(edgeR, quietly = TRUE) #edgeR-v3.30.3
library(vegan, quietly = TRUE)
library(Dune, quietly = TRUE)
library(ggplot2, quietly = TRUE) #ggplot2-v3.3.5
library(tidyverse, quietly = TRUE) #tidyverse-v1.3.1
library(ComplexHeatmap, quietly = TRUE)
library(DESeq2, quietly = TRUE)
library(genefilter, quietly = TRUE)
Change file names and conditions where appropriate.
# Input unfiltered data
treatmentinfo.file <- "samples.tsv"
gcount.file <- "salmon.numreads.tsv"
# Output DEG results
DGE_results.file <- "salmon.numreads.DGE_results.tsv"
DGE_samples.file <- "salmon.numreads.DGE_samples.tsv"
# Treatment conditions and variables to consider
condition <- "treatment"
treatments.to.compare <- c("HTAC", "ATAC")
treatment.colors <- c("ATAC" = "#1b9e77", "HTAC" = "#d95f02")
# pOverA gene filtering thresholds (only process genes with over these cutoffs)
pOverA.cutoff.P <- 0.8 # >80% of samples
pOverA.cutoff.A <- 10 # >10 read counts
# DESeq2 results filtering thresholds (only report results below these cutoffs)
lfcThreshold.cutoff <- log2(1.5) # Log threshold to use for DESeq2 results filtering
padj.cutoff <- 0.05 # Adjusted p-value threshold to use for DESeq2 results filtering
Load the input file containing the treatment information.
treatmentinfo <- read.csv(treatmentinfo.file, header = TRUE, sep = "\t", fileEncoding="UTF-8-BOM") # Read in file
head(treatmentinfo)
## sample_id unit lib_type fq1 fq2 species plug_id
## 1 Pacuta_ATAC_TP1_1043 1 pe SRR14610932 SRR14610932 Pacuta 1043
## 2 Pacuta_ATAC_TP1_1775 1 pe SRR14610931 SRR14610931 Pacuta 1775
## 3 Pacuta_ATAC_TP1_2363 1 pe SRR14610930 SRR14610930 Pacuta 2363
## 4 Pacuta_ATAC_TP3_1041 1 pe SRR14610929 SRR14610929 Pacuta 1041
## 5 Pacuta_ATAC_TP3_1471 1 pe SRR14610928 SRR14610928 Pacuta 1471
## 6 Pacuta_ATAC_TP3_1637 1 pe SRR14610927 SRR14610927 Pacuta 1637
## treatment timepoint reef ploidy group
## 1 ATAC 1 Lilipuna.Fringe 3 Group4
## 2 ATAC 1 Reef.42.43 2 Group5
## 3 ATAC 1 Reef.18 3 Group3
## 4 ATAC 3 Reef.11.13 3 Group2
## 5 ATAC 3 Reef.35.36 3 Group3
## 6 ATAC 3 Reef.11.13 3 Group2
# Check we have the right column names
headers <- c("sample_id", condition)
if( all(headers %in% colnames(treatmentinfo)) ){
cat(paste(treatmentinfo.file, "has the required columns:", paste(headers, collapse=', ')))
} else {
stop(paste(treatmentinfo.file, "is missing required columns:", paste(headers, collapse=', ')))
}
## samples.tsv has the required columns: sample_id, treatment
Filter treatmentinfo to just the samples that we want to
analyze. Adjust filtering as
required.
treatmentinfo <- treatmentinfo %>%
filter(!!rlang::sym(condition) %in% treatments.to.compare & timepoint=="8") %>%
mutate({{condition}} := factor(!!rlang::sym(condition), levels = treatments.to.compare))
head(treatmentinfo)
## sample_id unit lib_type fq1 fq2 species plug_id
## 1 Pacuta_ATAC_TP8_1051 1 pe SRR14610912 SRR14610912 Pacuta 1051
## 2 Pacuta_ATAC_TP8_1755 1 pe SRR14610911 SRR14610911 Pacuta 1755
## 3 Pacuta_ATAC_TP8_2012 1 pe SRR14610910 SRR14610910 Pacuta 2012
## 4 Pacuta_HTAC_TP8_1329 1 pe SRR14611019 SRR14611019 Pacuta 1329
## 5 Pacuta_HTAC_TP8_1765 1 pe SRR14611018 SRR14611018 Pacuta 1765
## 6 Pacuta_HTAC_TP8_2513 1 pe SRR14611017 SRR14611017 Pacuta 2513
## treatment timepoint reef ploidy group
## 1 ATAC 8 Reef.11.13 3 Group1
## 2 ATAC 8 Reef.42.43 2 Group5
## 3 ATAC 8 Reef.42.43 3 Group2
## 4 HTAC 8 Lilipuna.Fringe 2 Group6
## 5 HTAC 8 Reef.42.43 3 Group3
## 6 HTAC 8 Reef.18 3 Group2
Load the input file containing the gene count matrix.
gcount <- as.data.frame(read.csv(gcount.file, header = TRUE, sep = "\t", fileEncoding="UTF-8-BOM")) # Read in file
# Check we have the right column names
headers <- c("Name", treatmentinfo$sample_id)
if( all(headers %in% colnames(gcount)) ){
cat(paste(gcount.file, "has the required columns:", paste(headers, collapse=', ')))
} else {
stop(paste(gcount.file, "is missing required columns:", paste(headers, collapse=', ')))
}
## salmon.numreads.tsv has the required columns: Name, Pacuta_ATAC_TP8_1051, Pacuta_ATAC_TP8_1755, Pacuta_ATAC_TP8_2012, Pacuta_HTAC_TP8_1329, Pacuta_HTAC_TP8_1765, Pacuta_HTAC_TP8_2513
# Cleanup gcount data
gcount <- gcount %>%
column_to_rownames("Name") %>% # Makes "Name" column rownames
round() %>% # Round
select(treatmentinfo$sample_id) # Select just columns in filtered treatmentinfo file
# View dataset attributes
d <- dim(gcount); print(paste("rows:",d[[1]]," columns:",d[[2]], sep=''))
## [1] "rows:33259 columns:6"
head(gcount)[,1:3]
## Pacuta_ATAC_TP8_1051
## Pocillopora_acuta_KBHIv2___RNAseq.g16115.t1 51
## Pocillopora_acuta_KBHIv2___TS.g16868.t1 472
## Pocillopora_acuta_KBHIv2___RNAseq.g25384.t1 1
## Pocillopora_acuta_KBHIv2___RNAseq.g30122.t1 0
## Pocillopora_acuta_KBHIv2___RNAseq.g12817.t1 0
## Pocillopora_acuta_KBHIv2___TS.g9097.t1 140
## Pacuta_ATAC_TP8_1755
## Pocillopora_acuta_KBHIv2___RNAseq.g16115.t1 30
## Pocillopora_acuta_KBHIv2___TS.g16868.t1 364
## Pocillopora_acuta_KBHIv2___RNAseq.g25384.t1 0
## Pocillopora_acuta_KBHIv2___RNAseq.g30122.t1 0
## Pocillopora_acuta_KBHIv2___RNAseq.g12817.t1 1
## Pocillopora_acuta_KBHIv2___TS.g9097.t1 24
## Pacuta_ATAC_TP8_2012
## Pocillopora_acuta_KBHIv2___RNAseq.g16115.t1 51
## Pocillopora_acuta_KBHIv2___TS.g16868.t1 433
## Pocillopora_acuta_KBHIv2___RNAseq.g25384.t1 14
## Pocillopora_acuta_KBHIv2___RNAseq.g30122.t1 0
## Pocillopora_acuta_KBHIv2___RNAseq.g12817.t1 0
## Pocillopora_acuta_KBHIv2___TS.g9097.t1 127
Filter gcount to just genes with enough data across
columns (exclude genes which are mostly zeros)
# Create filter for the counts data
filt <- filterfun(pOverA(pOverA.cutoff.P, pOverA.cutoff.A))
gfilt <- genefilter(gcount, filt)
# Identify genes to keep by count filter
keep <- gcount[gfilt,]
# Identify gene lists
keep <- rownames(keep)
# Gene count data filtered in PoverA, P percent of the samples have counts over A
gcount_filt <- as.data.frame(gcount[which(rownames(gcount) %in% keep),])
d <- dim(gcount); print(paste("gcount - rows:",d[[1]]," columns:",d[[2]], sep='')) # Before filtering
## [1] "gcount - rows:33259 columns:6"
d <- dim(gcount_filt); print(paste("gcount_filt - rows:",d[[1]]," columns:",d[[2]], sep='')) # After filtering
## [1] "gcount_filt - rows:15583 columns:6"
Create a DESeqDataSet design from gene count matrix and labels. Here
we set the design to look at the column listed in the
condition variable.
gdds <- DESeqDataSetFromMatrix(countData = as.data.frame(gcount_filt),
colData = treatmentinfo,
design = as.formula(paste("~",condition, sep=''))
)
## converting counts to integer mode
Run differential expression test using a Wald model.
DEG <- DESeq(gdds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
Explore significant p-values for condition pairs
# Extract DESeq2 results
DEG.results <- results(DEG, contrast= c(condition, treatments.to.compare), lfcThreshold=lfcThreshold.cutoff)
head(DEG.results)
## log2 fold change (MLE): treatment HTAC vs ATAC
## Wald test p-value: treatment HTAC vs ATAC
## DataFrame with 6 rows and 6 columns
## baseMean log2FoldChange lfcSE
## <numeric> <numeric> <numeric>
## Pocillopora_acuta_KBHIv2___RNAseq.g16115.t1 51.4931 0.426436 0.423765
## Pocillopora_acuta_KBHIv2___TS.g16868.t1 368.5623 -0.470140 0.343510
## Pocillopora_acuta_KBHIv2___TS.g9097.t1 81.9351 -0.520940 0.596254
## Pocillopora_acuta_KBHIv2___RNAseq.g17538.t1 29.0177 0.722860 0.627746
## Pocillopora_acuta_KBHIv2___RNAseq.g4713.t1 584.8922 -3.112965 0.798367
## Pocillopora_acuta_KBHIv2___RNAseq.g19105.t1 46.5179 -0.453354 0.390970
## stat pvalue padj
## <numeric> <numeric> <numeric>
## Pocillopora_acuta_KBHIv2___RNAseq.g16115.t1 0.000000 1.00000000 1.000000
## Pocillopora_acuta_KBHIv2___TS.g16868.t1 0.000000 1.00000000 1.000000
## Pocillopora_acuta_KBHIv2___TS.g9097.t1 0.000000 1.00000000 1.000000
## Pocillopora_acuta_KBHIv2___RNAseq.g17538.t1 0.219671 0.82612729 NA
## Pocillopora_acuta_KBHIv2___RNAseq.g4713.t1 -3.166467 0.00154303 0.364046
## Pocillopora_acuta_KBHIv2___RNAseq.g19105.t1 0.000000 1.00000000 1.000000
# Filter results
DEGs <- as.data.frame(subset(DEG.results, padj<padj.cutoff))
nrow(DEGs)
## [1] 25
# Order p-values by smallest value first
results.ordered <- order(DEGs$padj)
# Make row names a column before writing to file.
DEGs$gene_id <- rownames(DEGs)
rownames(DEGs) <- NULL
DEGs <- relocate(DEGs, "gene_id", .before="baseMean")
# Write filtered DGE results and samples
write.table(DEGs, DGE_results.file, sep='\t', row.names=F, quote=F)
write.table(treatmentinfo, DGE_samples.file, sep='\t', row.names=F, quote=F)
We will now transform them with cpm for plotting, we will also subset the gene count matrix by the list of DEGs.
DEGlist <- gdds[DEGs$gene_id, ]
Apply a variance stabilizing transformation to minimize effects of small counts and normalize by library size
# Get size factor for each sample
sf <- estimateSizeFactors(gdds)$sizeFactor
print(paste("Max size factor: ",max(sf), sep=''))
## [1] "Max size factor: 1.85902608445539"
print(paste("Min size factor: ",min(sf), sep=''))
## [1] "Min size factor: 0.75064280748646"
# If above max(size factors) is less than 4, we can use VST.
Gvst <- vst(as.matrix(gcount_filt), blind=TRUE, fitType = "local")
## converting counts to integer mode
DEGvst <- Gvst[DEGs$gene_id,]
head(DEGvst) # View transformed gene count data
## Pacuta_ATAC_TP8_1051
## Pocillopora_acuta_KBHIv2___RNAseq.g7091.t1 7.393249
## Pocillopora_acuta_KBHIv2___RNAseq.g9680.t1 14.109410
## Pocillopora_acuta_KBHIv2___RNAseq.g24677.t1 10.275293
## Pocillopora_acuta_KBHIv2___TS.g3591.t1 10.438549
## Pocillopora_acuta_KBHIv2___RNAseq.g5755.t1 10.918501
## Pocillopora_acuta_KBHIv2___RNAseq.g29114.t2 8.270558
## Pacuta_ATAC_TP8_1755
## Pocillopora_acuta_KBHIv2___RNAseq.g7091.t1 6.738974
## Pocillopora_acuta_KBHIv2___RNAseq.g9680.t1 13.139975
## Pocillopora_acuta_KBHIv2___RNAseq.g24677.t1 8.237673
## Pocillopora_acuta_KBHIv2___TS.g3591.t1 9.244169
## Pocillopora_acuta_KBHIv2___RNAseq.g5755.t1 9.565123
## Pocillopora_acuta_KBHIv2___RNAseq.g29114.t2 8.257337
## Pacuta_ATAC_TP8_2012
## Pocillopora_acuta_KBHIv2___RNAseq.g7091.t1 7.422574
## Pocillopora_acuta_KBHIv2___RNAseq.g9680.t1 12.196555
## Pocillopora_acuta_KBHIv2___RNAseq.g24677.t1 8.541758
## Pocillopora_acuta_KBHIv2___TS.g3591.t1 9.962511
## Pocillopora_acuta_KBHIv2___RNAseq.g5755.t1 10.644623
## Pocillopora_acuta_KBHIv2___RNAseq.g29114.t2 8.926995
## Pacuta_HTAC_TP8_1329
## Pocillopora_acuta_KBHIv2___RNAseq.g7091.t1 5.988370
## Pocillopora_acuta_KBHIv2___RNAseq.g9680.t1 8.136902
## Pocillopora_acuta_KBHIv2___RNAseq.g24677.t1 7.076576
## Pocillopora_acuta_KBHIv2___TS.g3591.t1 7.244905
## Pocillopora_acuta_KBHIv2___RNAseq.g5755.t1 7.730138
## Pocillopora_acuta_KBHIv2___RNAseq.g29114.t2 6.349941
## Pacuta_HTAC_TP8_1765
## Pocillopora_acuta_KBHIv2___RNAseq.g7091.t1 5.741367
## Pocillopora_acuta_KBHIv2___RNAseq.g9680.t1 10.623002
## Pocillopora_acuta_KBHIv2___RNAseq.g24677.t1 6.879131
## Pocillopora_acuta_KBHIv2___TS.g3591.t1 8.079371
## Pocillopora_acuta_KBHIv2___RNAseq.g5755.t1 8.544964
## Pocillopora_acuta_KBHIv2___RNAseq.g29114.t2 7.015737
## Pacuta_HTAC_TP8_2513
## Pocillopora_acuta_KBHIv2___RNAseq.g7091.t1 5.654871
## Pocillopora_acuta_KBHIv2___RNAseq.g9680.t1 9.702286
## Pocillopora_acuta_KBHIv2___RNAseq.g24677.t1 6.444920
## Pocillopora_acuta_KBHIv2___TS.g3591.t1 6.644206
## Pocillopora_acuta_KBHIv2___RNAseq.g5755.t1 8.425329
## Pocillopora_acuta_KBHIv2___RNAseq.g29114.t2 6.100439
dim(DEGvst)
## [1] 25 6
Make a matrix for computing similarity
mat <- DEGvst # Make an expression object
mat <- mat - rowMeans(mat) # Difference in expression compared to average across all samples
Make a heatmap
hmTreatment <- subset(treatmentinfo, select=c(condition))
col <- list(); col[[condition]] = treatment.colors # Setup list for heatmap colors
hm_ann_col <- HeatmapAnnotation(df=hmTreatment, col = col) # Make dataframe for column naming
dend = cluster_within_group(mat, hmTreatment[[condition]])
DEGheatmap <- Heatmap(mat,
cluster_columns = dend,
column_split = 2,
name = "Gene expression (vst)",
show_row_names = F,
top_annotation = hm_ann_col,
show_column_names = F,
row_dend_side = "left",
column_dend_height = unit(0.5, "in"),
row_title_side = "right",
row_title_rot = 0,
row_dend_reorder = TRUE,
row_gap = unit(2.5, "mm"),
border = TRUE,
column_names_gp = gpar(fontsize = 10)
)
draw(DEGheatmap)
pca <- prcomp(t(mat)) # Calculate eigengenes
percentVar <- pca$sdev^2/sum(pca$sdev^2) # Save % variation by PC1 and PC2
d <- data.frame(treatmentinfo, PC1 = pca$x[, 1], PC2 = pca$x[, 2])
DEG_PCA <- ggplot(data = d, aes_string(x = "PC1", y = "PC2")) +
geom_point(size = 4, aes(colour=!!rlang::sym(condition))) +
xlab(paste0("PC1: ", round(percentVar[1] * 100), "% variance")) +
ylab(paste0("PC2: ", round(percentVar[2] * 100), "% variance")) +
coord_fixed() +
scale_color_manual(values = treatment.colors) +
theme_bw() + # Set background color
theme(panel.border = element_blank(), # Set border
panel.grid.major = element_blank(), # Set major gridlines
panel.grid.minor = element_blank(), # Set minor gridlines
axis.line = element_line(colour = "black", size = 0.6), # Set axes color
plot.background=element_blank(), # Set the plot background
axis.title = element_text(size = 14), # Axis title size
axis.text = element_blank()) # Axis text size and view plot
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: The `size` argument of `element_line()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
DEG_PCA